% Data-Driven Identification of Prognostic Tumor Subpopulations Using 
% Spatially Mapped t-SNE of Mass Spectrometry Imaging Data
% ===============NOTES:=====================
% Note: Some parts of this pipeline use an R implementation. Please install R and a
% suitable toolbox to connect R and Matlab.
% Note: We use the R package samr to perform the SAM analysis, and it is integrated
% in this Matlab script (calling R from Matlab). The results may therefore not be
% printed in a readable manner; in that case, you can find the explicit
% R implementation in the attached file "SAM_GastricCancer_R.txt"
% (if needed, change the value of the parameter delta; here, delta corresponds to FDR < 0.001).
% ******************** Terms and Conditions ***********************
% Please note that by using this implementation you agree on the terms of
% use:
%   1. The software may be used for research purposes only.
%   2. The software is for personal use only, and may not be redistributed.
%   3. In no event shall the LUMC be liable for any direct or indirect damage
%      arising in any way out of the use of this software.
%   4. Any publication arising from the use of this implementation should
%   cite reference : W. Abdelmoula et al, "Data-Driven Identification of Prognostic Tumor Subpopulations Using
%                   Spatially Mapped t-SNE of Mass Spectrometry Imaging Data", PNAS, 2016.
%       
% =========================================================================
% Written By Walid M. Abdelmoula, LUMC, NL


%% Load dataset and include required functions
% Start from a clean session: empty the workspace and the command window
% before anything is loaded.
clear all
clc
% Bring the variables stored in the dataset file into the workspace.
% NOTE(review): later sections read MSI_data_cube, Clinical_data, goodlist and
% pixel_to_sample_ID without defining them, so they presumably come from
% this file -- confirm.
load('gastric_cancer_dataset.mat');
% Put the helper folders on top of the search path. The folders are listed
% in top-of-path order, which is the precedence obtained by adding
% Functions, then kaplanmeier_plot, then RviaMatlab one call at a time.
% The paths are relative to the current folder.
addpath('../Matlab_Files/RviaMatlab/', ...
        '../Matlab_Files/Functions/kaplanmeier_plot/', ...
        '../Matlab_Files/Functions/');

%% 1. Dimensionality Reduction:
% Unfold the data cube into a 2-D matrix with one column per mass channel,
% keep only the rows flagged in goodlist, embed them in 3-D with t-SNE and
% show the embedding as a scatter plot and as a spatial image.
S = size(MSI_data_cube);
N_Masses = S(3);
N_Patients = length(Clinical_data);
% Rows run over the first two cube dimensions, columns over the third.
Folded_Measurements = reshape(MSI_data_cube, S(1)*S(2), S(3));
% Linear indices of the retained entries (goodlist > 0); the same indices
% are used later to place per-pixel results back on the S(1)-by-S(2) grid.
indx = find(goodlist > 0);
Tumors_All = Folded_Measurements(indx, :);
% First t-SNE pass (3 output dimensions).
mappedX1_3D = fast_tsne_seed(Tumors_All, 3);
% Second pass, started from the first pass's map: done for reproducibility
% and global convergence.
mappedX2_3D = fast_tsne_seed(Tumors_All, 3, [], [], [], [], mappedX1_3D);
% Per-point colors derived from the embedding (L*a*b* color system).
lab_Coloring = embedding2LABcolormap(mappedX2_3D);
% Scatter view of the 3-D embedding, marker size 3.
figure;
scatter3(mappedX2_3D(:,1), mappedX2_3D(:,2), mappedX2_3D(:,3), 3, lab_Coloring);
% Spatial view: the same colors mapped back to image coordinates.
tSNE_LABSegmentationMap = Visualize_tSNE_2DImage(lab_Coloring, S(1), S(2), indx);
%% 2. Discretization: Edge detection on tSNE spatial image and bisecting k-means images
% Find the optimal number of clusters from the tSNE-spatial image using bisecting k-means.
% Note: in this run the k-iterations are set to 2:10; the range can be changed
% inside the function Optimal_NumberClusters.
% Inputs: the 3-D embedding, the cube size S and the retained-pixel indices indx.
% NOTE(review): K_Clusters, IDX and C are returned here but not used in the
% visible part of this script -- confirm whether the called scripts need them.
[K_Clusters, Corr_Values, IDX, C] = Optimal_NumberClusters(mappedX2_3D,S,indx);
% Rank the candidate solutions by their correlation value, best first;
% Ranked_Index holds the positions in Corr_Values in that order.
[Ranked_Correlation, Ranked_Index] = sort(Corr_Values,'descend');
%% 3. Select the first ranked peak (i.e. optimum number of clusters)
% Rank_Order = 1 picks the best-correlated solution; use 2, 3, ... to try
% the next-ranked ones.
Rank_Order = 1;
% Left unsuppressed on purpose so the chosen value is echoed to the console.
% NOTE(review): this is a position in Corr_Values, used directly as the
% number of clusters; that is only correct if Corr_Values is indexed by k
% itself (the search range is stated as 2:10) -- confirm against
% Optimal_NumberClusters.
K_ranked = Ranked_Index(Rank_Order)
% Options vector handed to MyKmeans.
% NOTE(review): the meaning of its three entries is defined in MyKmeans.
opt = [1E-6, 1, 1];
% Bisecting k-means on the 3-D embedding with the selected number of clusters.
[rIDX, rC, rCovMat, rDmat, rCo] = MyKmeans(mappedX2_3D, K_ranked, opt);
% Spatial k-means image built from the cluster assignment.
[Kmeans_SegmentationMap, IDXs, Cs, Color_Scheme] = ...
    Visualize_combined_ClusteredImage(rIDX, rC, S, indx, K_ranked);

%% 4. Link to clinical data
% --assign clusters to the tissues' subpopulations:
% assign_regions applies a threshold to decide whether a subpopulation from
% a patient is kept for the clinical association analysis or neglected
% (threshold = (1/k)*100%).
% The pixel-to-sample mapping is filtered with the SAME predicate that
% selected the analysed pixels in step 1 (goodlist > 0), so that it stays
% aligned element-by-element with IDXs. (Filtering with goodlist == 1 would
% silently drop entries if goodlist ever holds positive values other than 1.)
[sample_to_component, pixel_to_component, IDXs_Thresholded] = ...
    assign_regions(IDXs, K_ranked, pixel_to_sample_ID(goodlist > 0));
% Number of distinct cluster labels actually present in IDXs.
nr_comps = length(unique(IDXs));
% K-means spatial image: mask of the pixels assigned to cluster 2 after thresholding.
SelectedCluster_Mask = Visualize_tSNE_2DImage(IDXs_Thresholded==2,S(1),S(2),indx);
% tSNE spatial image: LAB color space.
tSNE_LABSegmentationMap = Visualize_tSNE_2DImage(lab_Coloring,S(1),S(2),indx);
% Define a colormap with one color per cluster.
[RGB_COLORS_OfClusters,colstr] = RGB_Color_Ncomponents(nr_comps);
% NOTE(review): plots_dir is not used in this file; presumably read by the
% statistical analysis script below -- confirm.
plots_dir = pwd;
% Scatter view of the embedding, colored by cluster label.
figure, scatter3(mappedX2_3D(:,1),mappedX2_3D(:,2),mappedX2_3D(:,3),3,IDXs);
colormap(RGB_COLORS_OfClusters);grid off %color tSNE based on clusters
% Run Kaplan-Meier analysis: survival analysis.
% Note: make sure a connection for calling R from Matlab has been set up
% before running this survival analysis.
statistical_analyses_gastric_cancer;
%% 4.1  Investigate survivals between two groups: Statistical significance
% Choose which cluster IDs stand for the good- and the poor-survival group.
% Note: inspect the survival plot (KM_plot_k.png) first and set
% ID_goodSurv / ID_lowSurv below accordingly.
ID_goodSurv = 1; % cluster #1 has a good survival
ID_lowSurv = 2;  % cluster #2 has a bad survival
% Close the current figure before new ones are produced.
close;
% One color per cluster (overwrites colstr).
[ColorMap, colstr] = MyColorMap(nr_comps);
% Name of the per-k data file, e.g. 'jdata_3.txt' for nr_comps = 3.
DataSetName = ['jdata_', num2str(nr_comps), '.txt'];
[Full_goodSurv, Full_badSurv] = InvestigateSurvTwoGroups( ...
    ID_goodSurv, ID_lowSurv, IDXs, S, indx, DataSetName, ColorMap);
%% 5. Prognostic Features: SAM method
% Look at the survival plot produced in step #4 (0_KM_plot_k_.png) before
% running this section: it tells you which cluster ID is the low-survival
% one (Low_Surv) and which is the high-survival one (High_Surv).
% NOTE(review): SAM_FullData is invoked without arguments, so the variables
% set below are presumably its inputs via the shared workspace -- keep
% their names unchanged.
clear('MZ_Average_ConcatenatedSubpop')

% Survival groups.
Low_Surv = 2;   % the subpopulation of interest in binary labeling
High_Surv = 1;

% Labeling mode.
MultiClass_MultiLabeling = 0;
Triple_Labeling = 1; % (i.e. poor, medium, and high survivals)
% Triple_Labeling = 0; % (i.e. prognostic signature between poor and high survivals only)

% Cluster data.
% SelectedSubpop: the tumor subpopulations whose prognostic signature is to
% be retrieved (here: every label present in IDXs).
SelectedSubpop = unique(IDXs);
IDXs_Values = IDXs_Thresholded;
indxoo = indx;

SAM_FullData %(You can also run the attached R implementation "PrognosticValues_R_Gastric.txt"; Look at the delta variable)
%% 6.=========== Classification after performing SAM ===========
% The SAM method yields a feature vector of m/z features (based on the
% threshold "delta" that was set to the lowest FDR; see "PrognosticValues_R_Gastric.txt").
Mzs = [3374, 3445, 3409, 3482, 3516, 4967, 3711,14021, 7009, 4940]; % Multi-Class SAM AT K=3

% Build a classifier with three labels (1: low, 2: medium, 3: high survival).
clear NewLabels_IDXs
NewLabels_IDXs = IDXs;
Low_Surv = 2;
High_Surv = 1;
% High_Surv = [1,3]; % Multi-group (i.e. 1 & 3 are subpopulations with high survival)

% Positions of the low- and high-survival pixels, taken from the original
% cluster labels before any relabeling happens.
Low_i = find(NewLabels_IDXs == Low_Surv);
High_i = find(ismember(NewLabels_IDXs, High_Surv));
All_Indices = (1:length(NewLabels_IDXs))';
% Everything that is neither low nor high is "medium". The (:) forces both
% index lists into columns so the vertical concatenation also works when
% IDXs is a row vector (find then returns rows of differing lengths).
Medium_i = setdiff(All_Indices, [Low_i(:); High_i(:)]);

NewLabels_IDXs(Low_i) = 1;    % Low survivals labeled as 1
NewLabels_IDXs(Medium_i) = 2; % Medium survivals labeled as 2
NewLabels_IDXs(High_i) = 3;   % High survivals labeled as 3
%=========Classification============
% (Prediction result is stored in a variable called: Percentage_TestLabels_ALL)
% A KNN classifier is used and the trained model is assessed using 10-fold CV.
Classification_FullData  % FullData means here: full spectra
